The following URL, “http://cdiac.ess-dive.lbl.gov/ftp/ndp030/CSV-FILES/nation.1751_2014.csv”, contains data on fossil-fuel emissions.
# Download the CDIAC national fossil-fuel CO2 emissions data (1751-2014).
# stringsAsFactors = FALSE keeps Nation and other text columns as character.
emissions <- read.csv("http://cdiac.ess-dive.lbl.gov/ftp/ndp030/CSV-FILES/nation.1751_2014.csv",
stringsAsFactors = FALSE)
# Drop the first three rows -- presumably header/metadata rather than
# observations (TODO: confirm against the raw file).
emissions <- emissions[4:nrow(emissions), ]
head(emissions)
We aggregate the data using dplyr functions.

library(dplyr)
# World total per year: sum the per-nation emissions
# (thousand metric tons of carbon) within each Year.
emissionsTotal <- emissions %>%
group_by(Year) %>%
summarise(
world_total =
sum(Total.CO2.emissions.from.fossil.fuels.and.cement.production..thousand.metric.tons.of.C.))
library(ggplot2)
# Annual world emissions rescaled to Gt C (10^6 thousand metric tons = 1 Gt).
ggplot(emissionsTotal, aes(x = Year, y = world_total/10^6)) +
geom_line() + theme_classic()
Convert the country names to upper case with toupper(). Add a column ‘co2_emission’ equal to the \(CO_2\) emission in Gt, i.e. ‘Total.CO2.emissions.from.fossil.fuels.and.cement.production..thousand.metric.tons.of.C.’/10^6.

countries <- read.csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv",
stringsAsFactors = FALSE)
# Upper-case the ISO names so they can be compared with emissions$Nation,
# which is stored in upper case.
countries$NAME <- toupper(countries$name)
head(countries)
We can look for inconsistencies in the country names by inspecting the names that were not in both data frames:
# ISO country names that have no exact counterpart in the emissions data.
not_in_emissions <- setdiff(countries$NAME, emissions$Nation)
print(not_in_emissions)
[1] "ÅLAND ISLANDS" "AMERICAN SAMOA"
[3] "ANTARCTICA" "ANTIGUA AND BARBUDA"
[5] "BOLIVIA (PLURINATIONAL STATE OF)" "BONAIRE, SINT EUSTATIUS AND SABA"
[7] "BOSNIA AND HERZEGOVINA" "BOUVET ISLAND"
[9] "BRITISH INDIAN OCEAN TERRITORY" "BRUNEI DARUSSALAM"
[11] "CAMEROON" "CABO VERDE"
[13] "CHINA" "COCOS (KEELING) ISLANDS"
[15] "CONGO (DEMOCRATIC REPUBLIC OF THE)" "CÔTE D'IVOIRE"
[17] "CURAÇAO" "FAROE ISLANDS"
[19] "FRANCE" "FRENCH SOUTHERN TERRITORIES"
[21] "GUAM" "GUERNSEY"
[23] "GUINEA-BISSAU" "HEARD ISLAND AND MCDONALD ISLANDS"
[25] "HOLY SEE" "HONG KONG"
[27] "IRAN (ISLAMIC REPUBLIC OF)" "ISLE OF MAN"
[29] "ITALY" "JERSEY"
[31] "KOREA (DEMOCRATIC PEOPLE'S REPUBLIC OF)" "KOREA (REPUBLIC OF)"
[33] "LAO PEOPLE'S DEMOCRATIC REPUBLIC" "LIBYA"
[35] "MACAO" "MACEDONIA (THE FORMER YUGOSLAV REPUBLIC OF)"
[37] "MAYOTTE" "MICRONESIA (FEDERATED STATES OF)"
[39] "MOLDOVA (REPUBLIC OF)" "MONACO"
[41] "MYANMAR" "NORFOLK ISLAND"
[43] "NORTHERN MARIANA ISLANDS" "PALESTINE, STATE OF"
[45] "PITCAIRN" "RÉUNION"
[47] "SAINT BARTHÉLEMY" "SAINT HELENA, ASCENSION AND TRISTAN DA CUNHA"
[49] "SAINT KITTS AND NEVIS" "SAINT MARTIN (FRENCH PART)"
[51] "SAINT PIERRE AND MIQUELON" "SAINT VINCENT AND THE GRENADINES"
[53] "SAN MARINO" "SAO TOME AND PRINCIPE"
[55] "SINT MAARTEN (DUTCH PART)" "SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS"
[57] "SOUTH SUDAN" "SVALBARD AND JAN MAYEN"
[59] "TAIWAN, PROVINCE OF CHINA" "TANZANIA, UNITED REPUBLIC OF"
[61] "TIMOR-LESTE" "TOKELAU"
[63] "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND" "UNITED STATES MINOR OUTLYING ISLANDS"
[65] "VENEZUELA (BOLIVARIAN REPUBLIC OF)" "VIRGIN ISLANDS (BRITISH)"
[67] "VIRGIN ISLANDS (U.S.)" "WALLIS AND FUTUNA"
[69] "WESTERN SAHARA"
# Emissions nation names that have no exact counterpart in the ISO table.
not_in_countries <- setdiff(emissions$Nation, countries$NAME)
print(not_in_countries)
[1] "ANTARCTIC FISHERIES" "ANTIGUA & BARBUDA"
[3] "BONAIRE, SAINT EUSTATIUS, AND SABA" "BOSNIA & HERZEGOVINA"
[5] "BRITISH VIRGIN ISLANDS" "BRUNEI (DARUSSALAM)"
[7] "CAPE VERDE" "CHINA (MAINLAND)"
[9] "COTE D IVOIRE" "CURACAO"
[11] "CZECHOSLOVAKIA" "DEMOCRATIC PEOPLE S REPUBLIC OF KOREA"
[13] "DEMOCRATIC REPUBLIC OF THE CONGO (FORMERLY ZAIRE)" "DEMOCRATIC REPUBLIC OF VIETNAM"
[15] "EAST & WEST PAKISTAN" "FAEROE ISLANDS"
[17] "FEDERAL REPUBLIC OF GERMANY" "FEDERATED STATES OF MICRONESIA"
[19] "FEDERATION OF MALAYA-SINGAPORE" "FORMER DEMOCRATIC YEMEN"
[21] "FORMER GERMAN DEMOCRATIC REPUBLIC" "FORMER PANAMA CANAL ZONE"
[23] "FORMER YEMEN" "FRANCE (INCLUDING MONACO)"
[25] "FRENCH EQUATORIAL AFRICA" "FRENCH INDO-CHINA"
[27] "FRENCH WEST AFRICA" "GUINEA BISSAU"
[29] "HONG KONG SPECIAL ADMINSTRATIVE REGION OF CHINA" "ISLAMIC REPUBLIC OF IRAN"
[31] "ITALY (INCLUDING SAN MARINO)" "JAPAN (EXCLUDING THE RUYUKU ISLANDS)"
[33] "KUWAITI OIL FIRES" "LAO PEOPLE S DEMOCRATIC REPUBLIC"
[35] "LEEWARD ISLANDS" "LIBYAN ARAB JAMAHIRIYAH"
[37] "MACAU SPECIAL ADMINSTRATIVE REGION OF CHINA" "MACEDONIA"
[39] "MYANMAR (FORMERLY BURMA)" "NETHERLAND ANTILLES"
[41] "NETHERLAND ANTILLES AND ARUBA" "OCCUPIED PALESTINIAN TERRITORY"
[43] "PACIFIC ISLANDS (PALAU)" "PENINSULAR MALAYSIA"
[45] "PLURINATIONAL STATE OF BOLIVIA" "REPUBLIC OF CAMEROON"
[47] "REPUBLIC OF KOREA" "REPUBLIC OF MOLDOVA"
[49] "REPUBLIC OF SOUTH SUDAN" "REPUBLIC OF SOUTH VIETNAM"
[51] "REPUBLIC OF SUDAN" "REUNION"
[53] "RHODESIA-NYASALAND" "RWANDA-URUNDI"
[55] "RYUKYU ISLANDS" "SABAH"
[57] "SAINT HELENA" "SAINT MARTIN (DUTCH PORTION)"
[59] "SAO TOME & PRINCIPE" "SARAWAK"
[61] "ST. KITTS-NEVIS" "ST. KITTS-NEVIS-ANGUILLA"
[63] "ST. PIERRE & MIQUELON" "ST. VINCENT & THE GRENADINES"
[65] "TAIWAN" "TANGANYIKA"
[67] "TIMOR-LESTE (FORMERLY EAST TIMOR)" "UNITED KINGDOM"
[69] "UNITED KOREA" "UNITED REPUBLIC OF TANZANIA"
[71] "USSR" "VENEZUELA"
[73] "WALLIS AND FUTUNA ISLANDS" "YUGOSLAVIA (FORMER SOCIALIST FEDERAL REPUBLIC)"
[75] "YUGOSLAVIA (MONTENEGRO & SERBIA)" "ZANZIBAR"
Now, for each ‘Nation’ in emissions that is not in the ‘countries’ data-frame we find the country with the closest ‘NAME’ using agrep() function for approximate string matching.
# For each unmatched name, locate its closest approximate match via agrep()
# (default max.distance = 0.1); [1] keeps only the best hit and yields
# NA_integer_ when agrep finds nothing.  vapply() instead of sapply() pins
# the result type to integer regardless of input length.
# Note: names_in_countries_idx indexes into emissions$Nation, and
# names_in_emissions_idx indexes into countries$NAME (used that way below).
names_in_countries_idx <- vapply(not_in_emissions,
                                 function(x) agrep(x, emissions$Nation)[1],
                                 integer(1))
names_in_emissions_idx <- vapply(not_in_countries,
                                 function(x) agrep(x, countries$NAME)[1],
                                 integer(1))
# Pair each unmatched name with its fuzzy match; rows where either side is
# NA (no approximate match was found) are dropped.
countries_names_to_change <- data.frame(
name_in_countries = c(not_in_emissions, countries$NAME[names_in_emissions_idx]),
names_in_emissions = c(emissions$Nation[names_in_countries_idx], not_in_countries),
stringsAsFactors = FALSE) %>%
filter(!is.na(name_in_countries), !is.na(names_in_emissions))
countries_names_to_change
We see that two of the matches are wrong: AUSTRALIA –> USSR, and BONAIRE, SINT EUSTATIUS AND SABA –> SABAH.
So we drop them:
# Remove the two false-positive fuzzy matches identified above.
countries_names_to_change <- countries_names_to_change %>%
filter(!name_in_countries %in% c("AUSTRALIA", "BONAIRE, SINT EUSTATIUS AND SABA"))
Now, we change the original names in ‘countries’ to new names:
# Replace each name in `countries` with its emissions-data spelling.
# Bug fix: the original used grep(x, ...), which treats x as a regular
# expression, so names containing metacharacters such as "(" -- e.g.
# "IRAN (ISLAMIC REPUBLIC OF)" -- matched nothing and the name was clobbered
# with NA.  match() does an exact, vectorized lookup, which also removes the
# need for the sapply() loop.
match_idx <- match(countries$NAME, countries_names_to_change$name_in_countries)
countries$NAME <- ifelse(is.na(match_idx),
                         countries$NAME,
                         countries_names_to_change$names_in_emissions[match_idx])
# Attach the region metadata to every emissions record and express the
# emissions in Gt C (original units: thousand metric tons of C).
df <- emissions %>%
left_join(countries, by = c("Nation" = "NAME")) %>%
mutate(co2_emission = Total.CO2.emissions.from.fossil.fuels.and.cement.production..thousand.metric.tons.of.C./10^6)
# Empty sub-region strings mark unmatched nations; recode them as NA.
df$sub.region[df$sub.region == ""] <- NA
head(df)
Use dplyr to compute the total annual \(CO_2\) emission (‘co2_emission’) per ‘sub.region’.

df <- df %>%
group_by(Year, sub.region) %>%
summarise(co2_emission = sum(co2_emission))
head(df)
Use ggplot to generate a stacked density plot of the annual \(CO_2\) emission (in gigatonnes) by world region (“sub.region”). Your plot should resemble something like this (but with other regional categories, and slightly different values). Hint: use the geom_area() function with suitable parameters. Which region seems to produce the most \(CO_2\)? You might like to modify the color scheme to better distinguish regions.

library(RColorBrewer)
set.seed(89475)  # fix the random palette order so the plot is reproducible
# 23 colours interpolated from the 12-colour "Set3" palette, shuffled so
# adjacent regions get dissimilar colours.
cols <- sample(colorRampPalette(brewer.pal(12, "Set3"))(23))
# Stacked area chart: annual emissions (Gt C) accumulated over sub-regions;
# NA sub-regions (unmatched nations) are drawn in grey.
ggplot(df, aes(x = Year, y = co2_emission)) +
geom_area(aes(fill = sub.region), position = "stack") +
scale_fill_manual(na.value = "grey50", values = cols) +
theme_classic() + theme(legend.position = "bottom")
In this exercise we will use the DNA microarray gene expression data. You can read more about it on page 5 of “The Elements of Statistical Learning”.
# NCI microarray gene-expression data: columns are samples, rows are genes.
microarray <- read.table("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/nci.data")
# The info file's first 12 lines are free-text header, so skip them;
# column V1 then holds one label per sample (used as column names below).
info <- read.table("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/nci.info.txt",
skip = 12)
colnames(microarray) <- info$V1
microarray[1:5, 1:5]
In the ‘microarray’ matrix columns correspond to samples, and rows to genes.
Use order() to find the indices of the 500 genes with the largest standard deviation. Then, plot a heatmap without clustering/dendrograms. You can use the ‘asp = 0.2’ argument to change the aspect ratio of the heatmap.

idx <- order(apply(microarray,1, sd), decreasing = TRUE)[1:500]
# Heatmap of the 500 most variable genes; Rowv/Colv = NA suppresses the
# dendrograms and reordering on both axes.
heatmap(as.matrix(microarray)[idx, ],
        Rowv = NA, Colv = NA,
        scale = "none", asp = 0.2)
# Classic microarray palette: 15 steps fading from pure red through black
# to pure green.
redgreen <- c(
  "#FF0000", "#DB0000", "#B60000",
  "#920000", "#6D0000", "#490000",
  "#240000", "#000000", "#002400",
  "#004900", "#006D00", "#009200",
  "#00B600", "#00DB00", "#00FF00"
)
# Redraw the unclustered heatmap using the red-green palette.
heatmap(as.matrix(microarray)[idx, ],
        Rowv = NA, Colv = NA,
        col = redgreen, scale = "none", asp = 0.2)
Now, plot the same graph but with dendrogram for rows (rows clustering).
# Same display, but omitting Rowv restores row (gene) clustering and its
# dendrogram, while Colv = NA keeps the columns unordered.
heatmap(as.matrix(microarray)[idx, ],
        Colv = NA,
        col = redgreen, scale = "none", asp = 0.2)
Use heatmaply() to generate the previous plot. You might want to add a command similar to %>% layout(margin = list(l = 150, b = 350), autosize = F, width = 600, height = 800) to the plot to set margins and to resize it.

library(heatmaply)
# Interactive version of the row-clustered heatmap; margins and size are
# enlarged so the long row/column labels fit.
heatmaply(data.frame(microarray[idx, ]), Colv = FALSE, colors = redgreen,
scale = "none", margin = list(l = 150, b = 600),
width = 800, height = 1000, autosize = FALSE)
It seems like certain groups of patients have certain groups of genes up- or down-regulated; specifically, melanoma and colon cancer patients seem to have certain “blocks of green”, which means that specific groups of genes have increased expression in these types of patients.
Recall the movies data-frame we used in the lecture 3 exercises. It contains information on movies from the last three decades, which was scraped from the IMDB database.
url <- "https://raw.githubusercontent.com/Juanets/movie-stats/master/movies.csv"
# as_tibble() replaces the deprecated tbl_df(); both wrap the data frame as
# a tibble for compact printing.
movies <- as_tibble(read.csv(url))
movies
# Runtime distributions for Action vs Comedy: raw observations (jittered
# horizontally only, so runtimes stay exact) beneath outline boxplots.
ggplot(movies %>% filter(genre %in% c("Action", "Comedy")),
aes(x = genre, y = runtime)) +
geom_jitter(height = 0,alpha = 0.2, color = "grey30") +
geom_boxplot(lwd = 1, color = "darkgreen", fill = NA) +
theme_classic()
# One-sided Welch t-test.  With alternative = "greater" the alternative is
# mean(first factor level) > mean(second); here the first level is "Action"
# (alphabetical factor ordering), as confirmed by the output below.
t.test(formula = runtime ~ genre,
data = movies %>%
filter(genre %in% c("Action", "Comedy")),
alternative = "greater")
Welch Two Sample t-test
data: runtime by genre
t = 14.094, df = 2120.7, p-value < 2.2e-16
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
6.881018 Inf
sample estimates:
mean in group Action mean in group Comedy
109.0008 101.2101
Yes, the test showed that there is enough evidence to reject the null hypothesis; the action movies have a higher mean length than the comedies.
# ISL "Advertising" data; the first CSV column is a row index.
advertising <- read.csv("http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv",
row.names = 1)
head(advertising)
# Sales against TV budget with a fitted least-squares line.
ggplot(advertising, aes(x = TV, y = sales)) +
geom_point() + geom_smooth(method = "lm") +
theme_classic()
library(plotly)
# Interactive 3-D scatter: sales over the TV and radio budgets.
plot_ly(data = advertising, x = ~TV, y = ~radio, z = ~sales,
type = "scatter3d", mode = "markers", marker = list(size = 4))
Use sample(1:200, size = 150) to randomly choose row indices of the advertising dataset to include in the train set. The remaining indices should be used for the test set. Remember to choose and set the seed for randomization!

set.seed(12345)
# Hold out 50 of the 200 observations.  seq_len() is preferred over 1:nrow()
# (it yields an empty vector instead of c(1, 0) for zero-row data) and draws
# the identical sample here for the same seed.
idx <- sample(seq_len(nrow(advertising)), 150)
train <- advertising[idx, ]   # 150 rows for fitting
test <- advertising[-idx, ]   # remaining 50 rows for test RMSE
\[RMSE = \sqrt{\frac{1}{n} \sum\limits_{i = 1}^n(\hat y_i - y_i)^2}\]
# Simple linear regression: sales on TV budget alone.
fit1 <- lm(sales ~ TV, data = train)
summary(fit1)
Call:
lm(formula = sales ~ TV, data = train)
Residuals:
Min 1Q Median 3Q Max
-7.5442 -1.9976 -0.0122 1.9375 7.4951
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.236487 0.529751 13.66 <2e-16 ***
TV 0.045090 0.003164 14.25 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.266 on 148 degrees of freedom
Multiple R-squared: 0.5785, Adjusted R-squared: 0.5756
F-statistic: 203.1 on 1 and 148 DF, p-value: < 2.2e-16
# Test-set RMSE for the TV-only model.
yhat <- predict(fit1, test)
(rmse1 <- sqrt(mean((test$sales - yhat)^2)))
[1] 3.27969
# Multiple regression using all three advertising channels.
fit2 <- lm(sales ~ TV + radio + newspaper, data = train)
summary(fit2)
Call:
lm(formula = sales ~ TV + radio + newspaper, data = train)
Residuals:
Min 1Q Median 3Q Max
-8.7941 -0.9224 0.2934 1.1735 2.5006
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.796684 0.365948 7.642 2.61e-12 ***
TV 0.045314 0.001624 27.908 < 2e-16 ***
radio 0.190504 0.009962 19.124 < 2e-16 ***
newspaper 0.002500 0.006671 0.375 0.708
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.676 on 146 degrees of freedom
Multiple R-squared: 0.8905, Adjusted R-squared: 0.8883
F-statistic: 395.9 on 3 and 146 DF, p-value: < 2.2e-16
# Test-set RMSE for the three-predictor model.
yhat <- predict(fit2, test)
(rmse2 <- sqrt(mean((test$sales - yhat)^2)))
[1] 1.728615
The new model seems to improve the prediction, by decreasing the test error.
# Drop the insignificant newspaper term (p = 0.708 in fit2).
fit3 <- lm(sales ~ TV + radio, data = train)
summary(fit3)
Call:
lm(formula = sales ~ TV + radio, data = train)
Residuals:
Min 1Q Median 3Q Max
-8.8650 -0.9098 0.2910 1.1932 2.5064
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.840365 0.345867 8.212 1.02e-13 ***
TV 0.045326 0.001619 28.002 < 2e-16 ***
radio 0.191739 0.009373 20.457 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.671 on 147 degrees of freedom
Multiple R-squared: 0.8904, Adjusted R-squared: 0.8889
F-statistic: 597.3 on 2 and 147 DF, p-value: < 2.2e-16
# Test-set RMSE for the TV + radio model.
yhat <- predict(fit3, test)
(rmse3 <- sqrt(mean((test$sales - yhat)^2)))
[1] 1.721141
The last model has a slightly lower test error, but not by much. However, it contains fewer variables, so as a simpler model should be preferable.
We load the following datasets including characteristics of emails and spams:
`word_freq_WORD = percentage of words in the e-mail that match WORD.
char_freq_CHAR = percentage of characters in the e-mail that match CHAR,
1 continuous real [1,…] attribute of type capital_run_length_average = average length of uninterrupted sequences of capital letters
1 continuous integer [1,…] attribute of type
capital_run_length_longest = length of longest uninterrupted sequence of capital letters
capital_run_length_total = sum of length of uninterrupted sequences of capital letters = = total number of capital letters in the e-mail
spam = denotes whether the e-mail was considered spam (1) or not (0),
url.info <- "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names"
# In the .names format, "|" starts a comment line; the first 32 lines are
# preamble, after which each line lists one attribute as "name:".
spam.info <- read.table(url.info, comment.char = "|", skip = 32, stringsAsFactors = FALSE)
# Strip the trailing ":" from every attribute name.
attributes <- gsub(":", "", spam.info[[1]])
attributes[49: 54]
[1] "char_freq_;" "char_freq_(" "char_freq_[" "char_freq_!" "char_freq_$" "char_freq_#"
# Readable replacements for the six punctuation characters used as
# char_freq_ suffixes (";", "(", "[", "!", "$", "#").
symbols <- c(
  "semicolon", "left.parenthesis", "left.sq.bracket",
  "exclamation", "dollar", "hashtag"
)
# Replace the punctuation suffixes with their word equivalents so the
# names are valid, readable column names.
attributes[49: 54] <- paste0("char_freq_", symbols)
attributes[49: 54]
[1] "char_freq_semicolon" "char_freq_left.parenthesis" "char_freq_left.sq.bracket" "char_freq_exclamation"
[5] "char_freq_dollar" "char_freq_hashtag"
url.data <- "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
# The data file has no header row; column names come from the .names file.
spam <- read.csv(url.data, header = FALSE, stringsAsFactors = FALSE)
colnames(spam) <- c(attributes, "spam")
# Recode the 0/1 response as a labelled factor (0 = "email", 1 = "spam").
spam <- spam %>%
mutate(spam = factor(spam, levels = c(0, 1), labels = c("email", "spam")))
head(spam)
table(spam$spam)
email spam
2788 1813
# Class balance of the response.
ggplot(spam, aes(x = spam)) + geom_bar() +
theme_classic()
set.seed(27846)  # reproducible 60/40 train/test split
train.idx <- sample(nrow(spam), 0.6*nrow(spam))
train <- spam[train.idx, ]
test <- spam[-train.idx, ]
# Logistic regression on all 57 features.  (The "fitted probabilities
# numerically 0 or 1" warning below indicates quasi-separation in train.)
spam.logit <- glm(spam ~ ., train, family = "binomial")
glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(spam.logit)
Call:
glm(formula = spam ~ ., family = "binomial", data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-4.3845 -0.1827 0.0000 0.0977 4.1522
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.466e+00 1.829e-01 -8.013 1.12e-15 ***
word_freq_make -2.558e-01 2.855e-01 -0.896 0.370294
word_freq_address -1.480e-01 8.841e-02 -1.674 0.094203 .
word_freq_all 2.323e-01 1.553e-01 1.496 0.134686
word_freq_3d 2.940e+00 2.159e+00 1.362 0.173186
word_freq_our 7.941e-01 1.614e-01 4.921 8.60e-07 ***
word_freq_over 1.377e+00 3.651e-01 3.772 0.000162 ***
word_freq_remove 2.275e+00 4.265e-01 5.333 9.64e-08 ***
word_freq_internet 2.620e-01 1.952e-01 1.342 0.179517
word_freq_order 3.813e-01 4.435e-01 0.860 0.389916
word_freq_mail 8.650e-02 8.379e-02 1.032 0.301882
word_freq_receive -1.050e-01 3.548e-01 -0.296 0.767296
word_freq_will -2.777e-01 1.088e-01 -2.553 0.010671 *
word_freq_people -1.151e-01 2.981e-01 -0.386 0.699404
word_freq_report 2.987e-01 2.197e-01 1.360 0.173915
word_freq_addresses 9.744e-01 7.507e-01 1.298 0.194314
word_freq_free 9.939e-01 1.737e-01 5.723 1.05e-08 ***
word_freq_business 1.240e+00 3.144e-01 3.945 8.00e-05 ***
word_freq_email 1.112e-02 1.717e-01 0.065 0.948345
word_freq_you 7.141e-02 4.658e-02 1.533 0.125232
word_freq_credit 4.966e-01 4.365e-01 1.138 0.255260
word_freq_your 1.492e-01 6.871e-02 2.172 0.029842 *
word_freq_font 1.590e-01 2.066e-01 0.770 0.441347
word_freq_000 1.834e+00 5.483e-01 3.345 0.000823 ***
word_freq_money 4.859e-01 2.594e-01 1.873 0.061053 .
word_freq_hp -1.823e+00 3.699e-01 -4.928 8.29e-07 ***
word_freq_hpl -5.645e-01 4.500e-01 -1.254 0.209693
word_freq_george -1.629e+01 3.723e+00 -4.377 1.20e-05 ***
word_freq_650 4.805e-01 2.042e-01 2.353 0.018603 *
word_freq_lab -1.127e+01 9.501e+00 -1.186 0.235458
word_freq_labs -2.987e-01 3.578e-01 -0.835 0.403794
word_freq_telnet -1.310e-01 3.640e-01 -0.360 0.718857
word_freq_857 3.463e+00 3.336e+00 1.038 0.299123
word_freq_data -6.396e-01 3.539e-01 -1.807 0.070728 .
word_freq_415 2.162e+00 3.149e+00 0.687 0.492356
word_freq_85 -2.683e+00 1.499e+00 -1.789 0.073548 .
word_freq_technology 4.759e-01 3.937e-01 1.209 0.226799
word_freq_1999 -3.859e-01 3.898e-01 -0.990 0.322174
word_freq_parts -8.463e-01 2.030e+00 -0.417 0.676818
word_freq_pm -8.690e-01 6.298e-01 -1.380 0.167655
word_freq_direct -3.246e-01 4.979e-01 -0.652 0.514486
word_freq_cs -3.608e+01 3.266e+01 -1.105 0.269231
word_freq_meeting -5.285e+00 3.069e+00 -1.722 0.085022 .
word_freq_original -1.137e+00 1.148e+00 -0.991 0.321647
word_freq_project -1.053e+00 5.161e-01 -2.041 0.041285 *
word_freq_re -9.118e-01 2.188e-01 -4.167 3.08e-05 ***
word_freq_edu -1.829e+00 3.822e-01 -4.785 1.71e-06 ***
word_freq_table -1.877e+00 1.651e+00 -1.137 0.255589
word_freq_conference -4.467e+00 2.570e+00 -1.738 0.082238 .
char_freq_semicolon -1.194e+00 5.381e-01 -2.218 0.026525 *
char_freq_left.parenthesis -8.071e-01 5.521e-01 -1.462 0.143767
char_freq_left.sq.bracket -5.058e-01 9.056e-01 -0.559 0.576486
char_freq_exclamation 2.613e-01 7.460e-02 3.503 0.000460 ***
char_freq_dollar 8.337e+00 1.130e+00 7.380 1.59e-13 ***
char_freq_hashtag 2.449e+00 1.690e+00 1.449 0.147334
capital_run_length_average 2.230e-02 3.055e-02 0.730 0.465287
capital_run_length_longest 1.178e-02 3.638e-03 3.238 0.001205 **
capital_run_length_total 6.671e-04 2.869e-04 2.325 0.020067 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3692.0 on 2759 degrees of freedom
Residual deviance: 1052.9 on 2702 degrees of freedom
AIC: 1168.9
Number of Fisher Scoring iterations: 13
# Predicted spam probabilities on the held-out set.
spam.prob.logit <- predict(spam.logit, test, type = "response")
# Threshold at 0.5: probability < 0.5 is labelled "email", otherwise "spam".
spam.pred.logit <- factor(spam.prob.logit < 0.5, levels = c(TRUE, FALSE),
labels = c("email", "spam"))
(conf.mat.logit <- table(pred = spam.pred.logit, true = test$spam))
true
pred email spam
email 1057 76
spam 48 660
# Test accuracy = correctly classified fraction (diagonal of the matrix).
(accuracy.logit <- sum(diag(conf.mat.logit))/nrow(test))
[1] 0.9326453
There were 20 features coefficients significant at level \(\alpha = 0.05\). The logistic model seems to work well with accuracy 0.9326453.
library(randomForest)
# Random forest with default settings (500 trees); importance = TRUE stores
# permutation-based variable importance for inspection below.
spam.rf <- randomForest(spam ~ ., data = train, importance = TRUE)
spam.rf
Call:
randomForest(formula = spam ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 7
OOB estimate of error rate: 5.18%
Confusion matrix:
email spam class.error
email 1635 48 0.02852050
spam 95 982 0.08820799
varImpPlot(spam.rf)
# Rank the features by each of the two importance measures.  (The original
# variable names contained the typo "decrese"; renamed to "decrease" -- they
# are used only within this chunk.)
mean.decrease.accuracy <- importance(spam.rf) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("feature") %>%
  select(feature, MeanDecreaseAccuracy) %>%
  arrange(desc(MeanDecreaseAccuracy))
mean.decrease.accuracy
mean.decrease.gini <- importance(spam.rf) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("feature") %>%
  select(feature, MeanDecreaseGini) %>%
  arrange(desc(MeanDecreaseGini))
mean.decrease.gini
# Features appearing in the top 10 of both rankings.
intersect(mean.decrease.accuracy$feature[1:10], mean.decrease.gini$feature[1:10])
[1] "char_freq_exclamation" "capital_run_length_average" "word_freq_free" "char_freq_dollar"
[5] "word_freq_remove" "word_freq_hp" "capital_run_length_longest" "capital_run_length_total"
[9] "word_freq_your"
The variable importance measures indicate that exclamation marks and dollar signs, the frequency of words such as “free”, “remove”, “hp”, and “your”, as well as the length of sequences of capital letters, are predictive of whether a piece of text is spam.
# Random-forest class predictions and confusion matrix on the test set.
spam.pred.rf <- predict(spam.rf, test)
(conf.mat.rf <- table(pred = spam.pred.rf, true = test$spam))
true
pred email spam
email 1067 45
spam 38 691
# Test-set accuracy of the random forest.
(accuracy.rf <- sum(diag(conf.mat.rf))/nrow(test))
[1] 0.9549158
It seems like the random forest has a slight edge compared to the logistic regression model, with test accuracy of 0.95 vs 0.93. In general, for easy problems, random forest performs well as a black box predictor without much tuning or a prior variable selection.
Consider the locations from <- c(lon = -122.169719, lat = 37.4274745) and to <- c(lon = -122.16242, lat = 37.44457). Create a vector for the bounding box of the two locations, bbox <- c(left = longitude.from, bottom = latitude.from, right = longitude.to, top = latitude.to), with appropriate values filled in. Then, use get_map() and ggmap to generate a map containing the two locations. Use source = “google”, maptype = “satellite” and a city-level zoom, zoom = 15.

library(ggmap)
from <- c(lon = -122.169719, lat = 37.4274745)
# Fixed element name: the second coordinate is a latitude ("lat"), not
# "loc".  Downstream code indexes by position ([[2]]), so behaviour is
# unchanged.
to <- c(lon = -122.16242, lat = 37.44457)
# Bounding box spanning the two points (left/bottom = from, right/top = to).
bbox <- c(left = from[[1]], bottom = from[[2]],
right = to[[1]], top = to[[2]])
# Satellite basemap at city-level zoom, cropped to the bounding box.
myMap <- get_map(location = bbox, crop = TRUE, zoom = 15,
source = "google", maptype="satellite")
map <- ggmap(myMap)
map
Use route() from the ggmap package to generate a data-frame corresponding to the route from one location to the other. Then, use geom_path() to add the path corresponding to the route generated with the route() function. You can use the function revgeocode() to look up the addresses of the from and to locations.

route_df <- route(from, to, structure = "route")
# Overlay the computed route on the satellite map.
map <- map +
geom_path(data = route_df,
aes(x = lon, y = lat), colour = "red", size = 1.5)
map
revgeocode(from)  # street address of the starting point
[1] "450 Serra Mall, Stanford, CA 94305, USA"
revgeocode(to)  # street address of the destination
[1] "547 Emerson St, Palo Alto, CA 94301, USA"
You just mapped a route from Stanford to the only pub in town!
url.housing <- "https://raw.githubusercontent.com/simonkassel/Visualizing_SF_home_prices_R/master/Data/SF_home_sales_demo_data.csv"
# San Francisco home-sales demo data; the first CSV column is a row index.
sf.housing <- read.csv(url.housing, row.names = 1)
head(sf.housing)
Use get_map() and ggmap() to plot a map of San Francisco using source = “google”, maptype = “toner-lite” and zoom = 13. Then, use the data on housing prices above to add a layer of points colored by the SalePrice (use a good color scheme).
library(viridis)
# City-level basemap centred on San Francisco (geocode resolves the name
# to lon/lat coordinates).
sf.map.data <- get_map(location = geocode("San Francisco"),
zoom = 13, maptype = "toner-lite")
sf.map <- ggmap(sf.map.data)
# Overlay sale locations coloured by price; viridis gives a perceptually
# uniform colour scale.  Outer parentheses print the plot as well as save it.
(sf.map <- sf.map +
geom_point(data = sf.housing, aes(color = SalePrice, x = long, y = lat)) +
scale_color_viridis())
sessionInfo()  # record package versions for reproducibility
R version 3.4.2 (2017-09-28)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.3 LTS
Matrix products: default
BLAS: /usr/lib/libblas/libblas.so.3.6.0
LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggmap_2.6.1 randomForest_4.6-12 heatmaply_0.9.1 viridis_0.4.0 viridisLite_0.2.0 plotly_4.7.1
[7] RColorBrewer_1.1-2 bindrcpp_0.2 ggplot2_2.2.1.9000 dplyr_0.7.4.9000
loaded via a namespace (and not attached):
[1] httr_1.2.1 maps_3.1.1 tidyr_0.6.3 jsonlite_1.5 foreach_1.4.3 gtools_3.5.0
[7] shiny_1.0.5 assertthat_0.2.0 sp_1.2-4 stats4_3.4.2 yaml_2.1.14 robustbase_0.92-7
[13] backports_1.0.5 lattice_0.20-35 glue_1.1.1 digest_0.6.12 colorspace_1.3-2 htmltools_0.3.6
[19] httpuv_1.3.5 plyr_1.8.4 pkgconfig_2.0.1 purrr_0.2.2.2 xtable_1.8-2 mvtnorm_1.0-6
[25] scales_0.5.0.9000 gdata_2.17.0 whisker_0.3-2 jpeg_0.1-8 tibble_1.3.4 withr_2.1.0.9000
[31] nnet_7.3-12 lazyeval_0.2.1 proto_1.0.0 magrittr_1.5 mime_0.5 mclust_5.2.3
[37] evaluate_0.10.1 MASS_7.3-47 gplots_3.0.1 class_7.3-14 tools_3.4.2 registry_0.3
[43] data.table_1.10.4 geosphere_1.5-5 RgoogleMaps_1.4.1 trimcluster_0.1-2 stringr_1.2.0 kernlab_0.9-25
[49] munsell_0.4.3 cluster_2.0.6 fpc_2.1-10 compiler_3.4.2 caTools_1.17.1 rlang_0.1.2
[55] grid_3.4.2 iterators_1.0.8 rjson_0.2.15 htmlwidgets_0.9 crosstalk_1.0.0 base64enc_0.1-3
[61] rmarkdown_1.6 bitops_1.0-6 labeling_0.3 gtable_0.2.0 codetools_0.2-15 flexmix_2.3-14
[67] TSP_1.1-5 reshape2_1.4.2 R6_2.2.2 seriation_1.2-2 gridExtra_2.2.1 knitr_1.17
[73] prabclus_2.2-6 rprojroot_1.2 bindr_0.1 KernSmooth_2.23-15 dendextend_1.5.2 modeltools_0.2-21
[79] stringi_1.1.5 Rcpp_0.12.13 mapproj_1.2-4 png_0.1-7 gclus_1.3.1 DEoptimR_1.0-8
[85] diptest_0.75-7